#ifndef CUFFTDX_FFT_216_FP64_FWD_PTX_HPP
#define CUFFTDX_FFT_216_FP64_FWD_PTX_HPP



// Specialization 561 of cufftdx_private_function for double precision.
// NOTE(review): the include guard suggests this is a 216-point forward FFT
// kernel (6 x 6 x 6 decomposition) -- confirm against the generator.
//
// Parameters
//   rmem  Six complex<double> values per thread. rmem[0..5] are read as the
//         asm inputs and overwritten in place by the asm outputs.
//   smem  32-bit shared-memory byte address used as the base of the exchange
//         buffer (it is consumed by st.shared / ld.shared inside the asm).
//
// Structure of the PTX body
//   * Three 6-point butterflies computed purely in registers. The literals
//     0d3FE0000000000000 and 0d3FEBB67AE8584CAA are the doubles 0.5 and
//     0.8660254037844386 (sqrt(3)/2).
//   * Before each of the two shared-memory exchanges the butterfly outputs
//     are multiplied by twiddles loaded with ld.global from lut_dp_6_216
//     (indexed by tid.x % 36) and lut_dp_6_36 (indexed by (tid.x % 36) / 6).
//     The multiply/shift pairs (954437177, >>35) and (-1431655765, >>34) are
//     unsigned divisions by 36 and by 6.
//   * Shared-memory layout: the block of one transform starts at
//       smem + tid.y * 1728 + (tid.x / 36) * 1728
//     and elements are 8 bytes wide. Real parts and imaginary parts are
//     exchanged in two separate rounds through the same buffer, so each
//     exchange costs four barrier.sync instructions (eight in total).
//
// Requirements / caveats
//   * The body executes barrier.sync 0, so presumably every thread of the
//     block has to reach this call -- TODO confirm with the callers.
//   * Input operands %18, %21 and %26 (duplicates of rmem[1].y, rmem[2].y and
//     rmem[4].y) are not referenced by the PTX text. They are kept because
//     the positional numbering of all later operands depends on them.
//   * The statement declares a "memory" clobber: it reads and writes shared
//     memory and global lookup tables through addresses that are invisible to
//     the compiler, so surrounding memory accesses must not be cached in
//     registers or reordered across it.
template<> __forceinline__ __device__ void cufftdx_private_function<561, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<17>;
.reg .f64 fd<307>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %12;
mad.lo.s32 r3, r1, 1728, r2;
mov.u32 r4, %tid.x;
add.f64 fd25, %20, %25;
add.f64 fd26, %15, fd25;
add.f64 fd27, %22, %27;
add.f64 fd28, %16, fd27;
mul.f64 fd29, fd25, 0d3FE0000000000000;
sub.f64 fd30, %15, fd29;
sub.f64 fd31, %22, %27;
mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA;
add.f64 fd33, fd32, fd30;
sub.f64 fd34, fd30, fd32;
mul.f64 fd35, fd27, 0d3FE0000000000000;
sub.f64 fd36, %16, fd35;
sub.f64 fd37, %20, %25;
mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA;
sub.f64 fd39, fd36, fd38;
add.f64 fd40, fd38, fd36;
add.f64 fd41, %23, %28;
add.f64 fd42, %17, fd41;
add.f64 fd43, %24, %29;
add.f64 fd44, %19, fd43;
mul.f64 fd45, fd41, 0d3FE0000000000000;
sub.f64 fd46, %17, fd45;
sub.f64 fd47, %24, %29;
mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA;
add.f64 fd49, fd48, fd46;
sub.f64 fd50, fd46, fd48;
mul.f64 fd51, fd43, 0d3FE0000000000000;
sub.f64 fd52, %19, fd51;
sub.f64 fd53, %23, %28;
mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA;
sub.f64 fd55, fd52, fd54;
add.f64 fd56, fd54, fd52;
mul.f64 fd57, fd49, 0d3FE0000000000000;
mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA;
sub.f64 fd59, fd57, fd58;
mul.f64 fd60, fd55, 0d3FE0000000000000;
fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60;
mul.f64 fd62, fd50, 0dBFE0000000000000;
mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA;
sub.f64 fd64, fd62, fd63;
mul.f64 fd65, fd56, 0dBFE0000000000000;
fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65;
add.f64 fd67, fd26, fd42;
add.f64 fd68, fd28, fd44;
sub.f64 fd69, fd26, fd42;
sub.f64 fd70, fd28, fd44;
add.f64 fd71, fd33, fd59;
add.f64 fd72, fd39, fd61;
sub.f64 fd73, fd33, fd59;
sub.f64 fd74, fd39, fd61;
add.f64 fd75, fd34, fd64;
add.f64 fd76, fd40, fd66;
sub.f64 fd77, fd34, fd64;
sub.f64 fd78, fd40, fd66;
mul.wide.u32 rd2, r4, 954437177;
shr.u64 rd3, rd2, 35;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 36;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 16;
mov.u64 rd5, %13;
add.s64 rd6, rd5, rd4;
ld.global.v2.f64 {fd79, fd80}, [rd6];
mul.f64 fd83, fd79, fd71;
mul.f64 fd84, fd80, fd72;
sub.f64 fd85, fd83, fd84;
mul.f64 fd86, fd79, fd72;
fma.rn.f64 fd87, fd80, fd71, fd86;
mul.f64 fd88, fd79, fd79;
mul.f64 fd89, fd80, fd80;
sub.f64 fd90, fd88, fd89;
mul.f64 fd91, fd80, fd79;
fma.rn.f64 fd92, fd80, fd79, fd91;
mul.f64 fd93, fd90, fd75;
mul.f64 fd94, fd92, fd76;
sub.f64 fd95, fd93, fd94;
mul.f64 fd96, fd90, fd76;
fma.rn.f64 fd97, fd92, fd75, fd96;
mul.f64 fd98, fd79, fd90;
mul.f64 fd99, fd80, fd92;
sub.f64 fd100, fd98, fd99;
mul.f64 fd101, fd79, fd92;
fma.rn.f64 fd102, fd80, fd90, fd101;
mul.f64 fd103, fd100, fd69;
mul.f64 fd104, fd102, fd70;
sub.f64 fd105, fd103, fd104;
mul.f64 fd106, fd100, fd70;
fma.rn.f64 fd107, fd102, fd69, fd106;
ld.global.v2.f64 {fd108, fd109}, [rd6+576];
mul.f64 fd112, fd108, fd73;
mul.f64 fd113, fd109, fd74;
sub.f64 fd114, fd112, fd113;
mul.f64 fd115, fd108, fd74;
fma.rn.f64 fd116, fd109, fd73, fd115;
mul.f64 fd117, fd79, fd108;
mul.f64 fd118, fd80, fd109;
sub.f64 fd119, fd117, fd118;
mul.f64 fd120, fd79, fd109;
fma.rn.f64 fd121, fd80, fd108, fd120;
mul.f64 fd122, fd119, fd77;
mul.f64 fd123, fd121, fd78;
sub.f64 fd124, fd122, fd123;
mul.f64 fd125, fd119, fd78;
fma.rn.f64 fd126, fd121, fd77, fd125;
mad.lo.s32 r8, r5, 1728, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 48, r8;
st.shared.v2.f64 [r9], {fd67, fd85};
st.shared.v2.f64 [r9+16], {fd95, fd105};
st.shared.v2.f64 [r9+32], {fd114, fd124};
barrier.sync 0;
mad.lo.s32 r10, r7, -40, r9;
ld.shared.f64 fd127, [r10];
ld.shared.f64 fd128, [r10+288];
ld.shared.f64 fd129, [r10+576];
ld.shared.f64 fd130, [r10+864];
ld.shared.f64 fd131, [r10+1152];
ld.shared.f64 fd132, [r10+1440];
barrier.sync 0;
st.shared.v2.f64 [r9], {fd68, fd87};
st.shared.v2.f64 [r9+16], {fd97, fd107};
st.shared.v2.f64 [r9+32], {fd116, fd126};
barrier.sync 0;
ld.shared.f64 fd133, [r10];
ld.shared.f64 fd134, [r10+288];
ld.shared.f64 fd135, [r10+576];
ld.shared.f64 fd136, [r10+864];
ld.shared.f64 fd137, [r10+1152];
ld.shared.f64 fd138, [r10+1440];
add.f64 fd139, fd129, fd131;
add.f64 fd140, fd127, fd139;
add.f64 fd141, fd135, fd137;
add.f64 fd142, fd133, fd141;
mul.f64 fd143, fd139, 0d3FE0000000000000;
sub.f64 fd144, fd127, fd143;
sub.f64 fd145, fd135, fd137;
mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA;
add.f64 fd147, fd146, fd144;
sub.f64 fd148, fd144, fd146;
mul.f64 fd149, fd141, 0d3FE0000000000000;
sub.f64 fd150, fd133, fd149;
sub.f64 fd151, fd129, fd131;
mul.f64 fd152, fd151, 0d3FEBB67AE8584CAA;
sub.f64 fd153, fd150, fd152;
add.f64 fd154, fd152, fd150;
add.f64 fd155, fd130, fd132;
add.f64 fd156, fd128, fd155;
add.f64 fd157, fd136, fd138;
add.f64 fd158, fd134, fd157;
mul.f64 fd159, fd155, 0d3FE0000000000000;
sub.f64 fd160, fd128, fd159;
sub.f64 fd161, fd136, fd138;
mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA;
add.f64 fd163, fd162, fd160;
sub.f64 fd164, fd160, fd162;
mul.f64 fd165, fd157, 0d3FE0000000000000;
sub.f64 fd166, fd134, fd165;
sub.f64 fd167, fd130, fd132;
mul.f64 fd168, fd167, 0d3FEBB67AE8584CAA;
sub.f64 fd169, fd166, fd168;
add.f64 fd170, fd168, fd166;
mul.f64 fd171, fd163, 0d3FE0000000000000;
mul.f64 fd172, fd169, 0dBFEBB67AE8584CAA;
sub.f64 fd173, fd171, fd172;
mul.f64 fd174, fd169, 0d3FE0000000000000;
fma.rn.f64 fd175, fd163, 0dBFEBB67AE8584CAA, fd174;
mul.f64 fd176, fd164, 0dBFE0000000000000;
mul.f64 fd177, fd170, 0dBFEBB67AE8584CAA;
sub.f64 fd178, fd176, fd177;
mul.f64 fd179, fd170, 0dBFE0000000000000;
fma.rn.f64 fd180, fd164, 0dBFEBB67AE8584CAA, fd179;
add.f64 fd181, fd140, fd156;
add.f64 fd182, fd142, fd158;
sub.f64 fd183, fd140, fd156;
sub.f64 fd184, fd142, fd158;
add.f64 fd185, fd147, fd173;
add.f64 fd186, fd153, fd175;
sub.f64 fd187, fd147, fd173;
sub.f64 fd188, fd153, fd175;
add.f64 fd189, fd148, fd178;
add.f64 fd190, fd154, fd180;
sub.f64 fd191, fd148, fd178;
sub.f64 fd192, fd154, fd180;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 34;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 6;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 16;
mov.u64 rd10, %14;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd193, fd194}, [rd11];
mul.f64 fd197, fd193, fd185;
mul.f64 fd198, fd194, fd186;
sub.f64 fd199, fd197, fd198;
mul.f64 fd200, fd193, fd186;
fma.rn.f64 fd201, fd194, fd185, fd200;
mul.f64 fd202, fd193, fd193;
mul.f64 fd203, fd194, fd194;
sub.f64 fd204, fd202, fd203;
mul.f64 fd205, fd194, fd193;
fma.rn.f64 fd206, fd194, fd193, fd205;
mul.f64 fd207, fd204, fd189;
mul.f64 fd208, fd206, fd190;
sub.f64 fd209, fd207, fd208;
mul.f64 fd210, fd204, fd190;
fma.rn.f64 fd211, fd206, fd189, fd210;
mul.f64 fd212, fd193, fd204;
mul.f64 fd213, fd194, fd206;
sub.f64 fd214, fd212, fd213;
mul.f64 fd215, fd193, fd206;
fma.rn.f64 fd216, fd194, fd204, fd215;
mul.f64 fd217, fd214, fd183;
mul.f64 fd218, fd216, fd184;
sub.f64 fd219, fd217, fd218;
mul.f64 fd220, fd214, fd184;
fma.rn.f64 fd221, fd216, fd183, fd220;
ld.global.v2.f64 {fd222, fd223}, [rd11+96];
mul.f64 fd226, fd222, fd187;
mul.f64 fd227, fd223, fd188;
sub.f64 fd228, fd226, fd227;
mul.f64 fd229, fd222, fd188;
fma.rn.f64 fd230, fd223, fd187, fd229;
mul.f64 fd231, fd193, fd222;
mul.f64 fd232, fd194, fd223;
sub.f64 fd233, fd231, fd232;
mul.f64 fd234, fd193, fd223;
fma.rn.f64 fd235, fd194, fd222, fd234;
mul.f64 fd236, fd233, fd191;
mul.f64 fd237, fd235, fd192;
sub.f64 fd238, fd236, fd237;
mul.f64 fd239, fd233, fd192;
fma.rn.f64 fd240, fd235, fd191, fd239;
shl.b32 r14, r13, 3;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 288, r15;
st.shared.f64 [r16], fd181;
st.shared.f64 [r16+48], fd199;
st.shared.f64 [r16+96], fd209;
st.shared.f64 [r16+144], fd219;
st.shared.f64 [r16+192], fd228;
st.shared.f64 [r16+240], fd238;
barrier.sync 0;
ld.shared.f64 fd241, [r10];
ld.shared.f64 fd242, [r10+288];
ld.shared.f64 fd243, [r10+576];
ld.shared.f64 fd244, [r10+864];
ld.shared.f64 fd245, [r10+1152];
ld.shared.f64 fd246, [r10+1440];
barrier.sync 0;
st.shared.f64 [r16], fd182;
st.shared.f64 [r16+48], fd201;
st.shared.f64 [r16+96], fd211;
st.shared.f64 [r16+144], fd221;
st.shared.f64 [r16+192], fd230;
st.shared.f64 [r16+240], fd240;
barrier.sync 0;
ld.shared.f64 fd247, [r10];
ld.shared.f64 fd248, [r10+288];
ld.shared.f64 fd249, [r10+576];
ld.shared.f64 fd250, [r10+864];
ld.shared.f64 fd251, [r10+1152];
ld.shared.f64 fd252, [r10+1440];
add.f64 fd253, fd243, fd245;
add.f64 fd254, fd241, fd253;
add.f64 fd255, fd249, fd251;
add.f64 fd256, fd247, fd255;
mul.f64 fd257, fd253, 0d3FE0000000000000;
sub.f64 fd258, fd241, fd257;
sub.f64 fd259, fd249, fd251;
mul.f64 fd260, fd259, 0d3FEBB67AE8584CAA;
add.f64 fd261, fd260, fd258;
sub.f64 fd262, fd258, fd260;
mul.f64 fd263, fd255, 0d3FE0000000000000;
sub.f64 fd264, fd247, fd263;
sub.f64 fd265, fd243, fd245;
mul.f64 fd266, fd265, 0d3FEBB67AE8584CAA;
sub.f64 fd267, fd264, fd266;
add.f64 fd268, fd266, fd264;
add.f64 fd269, fd244, fd246;
add.f64 fd270, fd242, fd269;
add.f64 fd271, fd250, fd252;
add.f64 fd272, fd248, fd271;
mul.f64 fd273, fd269, 0d3FE0000000000000;
sub.f64 fd274, fd242, fd273;
sub.f64 fd275, fd250, fd252;
mul.f64 fd276, fd275, 0d3FEBB67AE8584CAA;
add.f64 fd277, fd276, fd274;
sub.f64 fd278, fd274, fd276;
mul.f64 fd279, fd271, 0d3FE0000000000000;
sub.f64 fd280, fd248, fd279;
sub.f64 fd281, fd244, fd246;
mul.f64 fd282, fd281, 0d3FEBB67AE8584CAA;
sub.f64 fd283, fd280, fd282;
add.f64 fd284, fd282, fd280;
mul.f64 fd285, fd277, 0d3FE0000000000000;
mul.f64 fd286, fd283, 0dBFEBB67AE8584CAA;
sub.f64 fd287, fd285, fd286;
mul.f64 fd288, fd283, 0d3FE0000000000000;
fma.rn.f64 fd289, fd277, 0dBFEBB67AE8584CAA, fd288;
mul.f64 fd290, fd278, 0dBFE0000000000000;
mul.f64 fd291, fd284, 0dBFEBB67AE8584CAA;
sub.f64 fd292, fd290, fd291;
mul.f64 fd293, fd284, 0dBFE0000000000000;
fma.rn.f64 fd294, fd278, 0dBFEBB67AE8584CAA, fd293;
add.f64 %0, fd254, fd270;
add.f64 %1, fd256, fd272;
add.f64 %3, fd267, fd289;
add.f64 %2, fd261, fd287;
add.f64 %5, fd268, fd294;
add.f64 %4, fd262, fd292;
sub.f64 %6, fd254, fd270;
sub.f64 %7, fd256, fd272;
sub.f64 %9, fd267, fd289;
sub.f64 %8, fd261, fd287;
sub.f64 %11, fd268, fd294;
sub.f64 %10, fd262, fd292;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)
     // Shared-memory stores/loads and barriers are hidden from the compiler.
     : "memory");
}




// Specialization 562 of cufftdx_private_function for double precision.
// NOTE(review): the include guard suggests this is a 216-point forward FFT
// kernel (6 x 6 x 6 decomposition) -- confirm against the generator.
//
// Parameters
//   rmem  Six complex<double> values per thread. rmem[0..5] are read as the
//         asm inputs and overwritten in place by the asm outputs.
//   smem  32-bit shared-memory byte address used as the base of the exchange
//         buffer (it is consumed by st.shared / ld.shared inside the asm).
//
// Structure of the PTX body
//   * Three 6-point butterflies computed purely in registers. The literals
//     0d3FE0000000000000 and 0d3FEBB67AE8584CAA are the doubles 0.5 and
//     0.8660254037844386 (sqrt(3)/2).
//   * Before each of the two shared-memory exchanges the butterfly outputs
//     are multiplied by twiddles loaded with ld.global from lut_dp_6_216
//     (indexed by tid.x % 36) and lut_dp_6_36 (indexed by (tid.x % 36) / 6).
//     The multiply/shift pairs (954437177, >>35) and (-1431655765, >>34) are
//     unsigned divisions by 36 and by 6.
//   * Shared-memory layout: the block of one transform starts at
//       smem + tid.y * 3456 + (tid.x / 36) * 3456
//     and elements are 16 bytes wide. Real and imaginary parts travel
//     together as v2.f64 pairs, so each exchange needs only two
//     barrier.sync instructions (four in total).
//
// Requirements / caveats
//   * The body executes barrier.sync 0, so presumably every thread of the
//     block has to reach this call -- TODO confirm with the callers.
//   * Input operands %18, %21 and %26 (duplicates of rmem[1].y, rmem[2].y and
//     rmem[4].y) are not referenced by the PTX text. They are kept because
//     the positional numbering of all later operands depends on them.
//   * The statement declares a "memory" clobber: it reads and writes shared
//     memory and global lookup tables through addresses that are invisible to
//     the compiler, so surrounding memory accesses must not be cached in
//     registers or reordered across it.
template<> __forceinline__ __device__ void cufftdx_private_function<562, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){

asm volatile (R"({
.reg .b32 r<17>;
.reg .f64 fd<331>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %12;
mad.lo.s32 r3, r1, 3456, r2;
mov.u32 r4, %tid.x;
add.f64 fd25, %20, %25;
add.f64 fd26, %15, fd25;
add.f64 fd27, %22, %27;
add.f64 fd28, %16, fd27;
mul.f64 fd29, fd25, 0d3FE0000000000000;
sub.f64 fd30, %15, fd29;
sub.f64 fd31, %22, %27;
mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA;
add.f64 fd33, fd32, fd30;
sub.f64 fd34, fd30, fd32;
mul.f64 fd35, fd27, 0d3FE0000000000000;
sub.f64 fd36, %16, fd35;
sub.f64 fd37, %20, %25;
mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA;
sub.f64 fd39, fd36, fd38;
add.f64 fd40, fd38, fd36;
add.f64 fd41, %23, %28;
add.f64 fd42, %17, fd41;
add.f64 fd43, %24, %29;
add.f64 fd44, %19, fd43;
mul.f64 fd45, fd41, 0d3FE0000000000000;
sub.f64 fd46, %17, fd45;
sub.f64 fd47, %24, %29;
mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA;
add.f64 fd49, fd48, fd46;
sub.f64 fd50, fd46, fd48;
mul.f64 fd51, fd43, 0d3FE0000000000000;
sub.f64 fd52, %19, fd51;
sub.f64 fd53, %23, %28;
mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA;
sub.f64 fd55, fd52, fd54;
add.f64 fd56, fd54, fd52;
mul.f64 fd57, fd49, 0d3FE0000000000000;
mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA;
sub.f64 fd59, fd57, fd58;
mul.f64 fd60, fd55, 0d3FE0000000000000;
fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60;
mul.f64 fd62, fd50, 0dBFE0000000000000;
mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA;
sub.f64 fd64, fd62, fd63;
mul.f64 fd65, fd56, 0dBFE0000000000000;
fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65;
sub.f64 fd67, fd26, fd42;
sub.f64 fd68, fd28, fd44;
add.f64 fd69, fd33, fd59;
add.f64 fd70, fd39, fd61;
sub.f64 fd71, fd33, fd59;
sub.f64 fd72, fd39, fd61;
add.f64 fd73, fd34, fd64;
add.f64 fd74, fd40, fd66;
sub.f64 fd75, fd34, fd64;
sub.f64 fd76, fd40, fd66;
mul.wide.u32 rd2, r4, 954437177;
shr.u64 rd3, rd2, 35;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 36;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 3456, r3;
mul.wide.u32 rd4, r7, 16;
mov.u64 rd5, %13;
add.s64 rd6, rd5, rd4;
ld.global.v2.f64 {fd77, fd78}, [rd6];
mul.f64 fd81, fd77, fd69;
mul.f64 fd82, fd78, fd70;
mul.f64 fd83, fd77, fd70;
mul.f64 fd84, fd77, fd77;
mul.f64 fd85, fd78, fd78;
sub.f64 fd86, fd84, fd85;
mul.f64 fd87, fd78, fd77;
fma.rn.f64 fd88, fd78, fd77, fd87;
mul.f64 fd89, fd86, fd73;
mul.f64 fd90, fd88, fd74;
mul.f64 fd91, fd86, fd74;
mul.f64 fd92, fd77, fd86;
mul.f64 fd93, fd78, fd88;
sub.f64 fd94, fd92, fd93;
mul.f64 fd95, fd77, fd88;
fma.rn.f64 fd96, fd78, fd86, fd95;
mul.f64 fd97, fd94, fd67;
mul.f64 fd98, fd96, fd68;
mul.f64 fd99, fd94, fd68;
ld.global.v2.f64 {fd100, fd101}, [rd6+576];
mul.f64 fd104, fd100, fd71;
mul.f64 fd105, fd101, fd72;
mul.f64 fd106, fd100, fd72;
mul.f64 fd107, fd77, fd100;
mul.f64 fd108, fd78, fd101;
sub.f64 fd109, fd107, fd108;
mul.f64 fd110, fd77, fd101;
fma.rn.f64 fd111, fd78, fd100, fd110;
mul.f64 fd112, fd109, fd75;
mul.f64 fd113, fd111, fd76;
mul.f64 fd114, fd109, fd76;
barrier.sync 0;
mad.lo.s32 r9, r7, 96, r8;
add.f64 fd115, fd28, fd44;
add.f64 fd116, fd26, fd42;
st.shared.v2.f64 [r9], {fd116, fd115};
fma.rn.f64 fd117, fd78, fd69, fd83;
sub.f64 fd118, fd81, fd82;
st.shared.v2.f64 [r9+16], {fd118, fd117};
fma.rn.f64 fd119, fd88, fd73, fd91;
sub.f64 fd120, fd89, fd90;
st.shared.v2.f64 [r9+32], {fd120, fd119};
fma.rn.f64 fd121, fd96, fd67, fd99;
sub.f64 fd122, fd97, fd98;
st.shared.v2.f64 [r9+48], {fd122, fd121};
fma.rn.f64 fd123, fd101, fd71, fd106;
sub.f64 fd124, fd104, fd105;
st.shared.v2.f64 [r9+64], {fd124, fd123};
fma.rn.f64 fd125, fd111, fd75, fd114;
sub.f64 fd126, fd112, fd113;
st.shared.v2.f64 [r9+80], {fd126, fd125};
barrier.sync 0;
mad.lo.s32 r10, r7, -80, r9;
ld.shared.v2.f64 {fd127, fd128}, [r10];
ld.shared.v2.f64 {fd131, fd132}, [r10+576];
ld.shared.v2.f64 {fd135, fd136}, [r10+1152];
ld.shared.v2.f64 {fd139, fd140}, [r10+1728];
ld.shared.v2.f64 {fd143, fd144}, [r10+2304];
ld.shared.v2.f64 {fd147, fd148}, [r10+2880];
add.f64 fd151, fd135, fd143;
add.f64 fd152, fd127, fd151;
add.f64 fd153, fd136, fd144;
add.f64 fd154, fd128, fd153;
mul.f64 fd155, fd151, 0d3FE0000000000000;
sub.f64 fd156, fd127, fd155;
sub.f64 fd157, fd136, fd144;
mul.f64 fd158, fd157, 0d3FEBB67AE8584CAA;
add.f64 fd159, fd158, fd156;
sub.f64 fd160, fd156, fd158;
mul.f64 fd161, fd153, 0d3FE0000000000000;
sub.f64 fd162, fd128, fd161;
sub.f64 fd163, fd135, fd143;
mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA;
sub.f64 fd165, fd162, fd164;
add.f64 fd166, fd164, fd162;
add.f64 fd167, fd139, fd147;
add.f64 fd168, fd131, fd167;
add.f64 fd169, fd140, fd148;
add.f64 fd170, fd132, fd169;
mul.f64 fd171, fd167, 0d3FE0000000000000;
sub.f64 fd172, fd131, fd171;
sub.f64 fd173, fd140, fd148;
mul.f64 fd174, fd173, 0d3FEBB67AE8584CAA;
add.f64 fd175, fd174, fd172;
sub.f64 fd176, fd172, fd174;
mul.f64 fd177, fd169, 0d3FE0000000000000;
sub.f64 fd178, fd132, fd177;
sub.f64 fd179, fd139, fd147;
mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA;
sub.f64 fd181, fd178, fd180;
add.f64 fd182, fd180, fd178;
mul.f64 fd183, fd175, 0d3FE0000000000000;
mul.f64 fd184, fd181, 0dBFEBB67AE8584CAA;
sub.f64 fd185, fd183, fd184;
mul.f64 fd186, fd181, 0d3FE0000000000000;
fma.rn.f64 fd187, fd175, 0dBFEBB67AE8584CAA, fd186;
mul.f64 fd188, fd176, 0dBFE0000000000000;
mul.f64 fd189, fd182, 0dBFEBB67AE8584CAA;
sub.f64 fd190, fd188, fd189;
mul.f64 fd191, fd182, 0dBFE0000000000000;
fma.rn.f64 fd192, fd176, 0dBFEBB67AE8584CAA, fd191;
sub.f64 fd193, fd152, fd168;
sub.f64 fd194, fd154, fd170;
add.f64 fd195, fd159, fd185;
add.f64 fd196, fd165, fd187;
sub.f64 fd197, fd159, fd185;
sub.f64 fd198, fd165, fd187;
add.f64 fd199, fd160, fd190;
add.f64 fd200, fd166, fd192;
sub.f64 fd201, fd160, fd190;
sub.f64 fd202, fd166, fd192;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 34;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 6;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 16;
mov.u64 rd10, %14;
add.s64 rd11, rd10, rd9;
ld.global.v2.f64 {fd203, fd204}, [rd11];
mul.f64 fd207, fd203, fd195;
mul.f64 fd208, fd204, fd196;
mul.f64 fd209, fd203, fd196;
mul.f64 fd210, fd203, fd203;
mul.f64 fd211, fd204, fd204;
sub.f64 fd212, fd210, fd211;
mul.f64 fd213, fd204, fd203;
fma.rn.f64 fd214, fd204, fd203, fd213;
mul.f64 fd215, fd212, fd199;
mul.f64 fd216, fd214, fd200;
mul.f64 fd217, fd212, fd200;
mul.f64 fd218, fd203, fd212;
mul.f64 fd219, fd204, fd214;
sub.f64 fd220, fd218, fd219;
mul.f64 fd221, fd203, fd214;
fma.rn.f64 fd222, fd204, fd212, fd221;
mul.f64 fd223, fd220, fd193;
mul.f64 fd224, fd222, fd194;
mul.f64 fd225, fd220, fd194;
ld.global.v2.f64 {fd226, fd227}, [rd11+96];
mul.f64 fd230, fd226, fd197;
mul.f64 fd231, fd227, fd198;
mul.f64 fd232, fd226, fd198;
mul.f64 fd233, fd203, fd226;
mul.f64 fd234, fd204, fd227;
sub.f64 fd235, fd233, fd234;
mul.f64 fd236, fd203, fd227;
fma.rn.f64 fd237, fd204, fd226, fd236;
mul.f64 fd238, fd235, fd201;
mul.f64 fd239, fd237, fd202;
mul.f64 fd240, fd235, fd202;
shl.b32 r14, r13, 4;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 576, r15;
add.f64 fd241, fd154, fd170;
add.f64 fd242, fd152, fd168;
st.shared.v2.f64 [r16], {fd242, fd241};
fma.rn.f64 fd243, fd204, fd195, fd209;
sub.f64 fd244, fd207, fd208;
st.shared.v2.f64 [r16+96], {fd244, fd243};
fma.rn.f64 fd245, fd214, fd199, fd217;
sub.f64 fd246, fd215, fd216;
st.shared.v2.f64 [r16+192], {fd246, fd245};
fma.rn.f64 fd247, fd222, fd193, fd225;
sub.f64 fd248, fd223, fd224;
st.shared.v2.f64 [r16+288], {fd248, fd247};
fma.rn.f64 fd249, fd227, fd197, fd232;
sub.f64 fd250, fd230, fd231;
st.shared.v2.f64 [r16+384], {fd250, fd249};
fma.rn.f64 fd251, fd237, fd201, fd240;
sub.f64 fd252, fd238, fd239;
st.shared.v2.f64 [r16+480], {fd252, fd251};
barrier.sync 0;
ld.shared.v2.f64 {fd253, fd254}, [r10];
ld.shared.v2.f64 {fd257, fd258}, [r10+576];
ld.shared.v2.f64 {fd261, fd262}, [r10+1152];
ld.shared.v2.f64 {fd265, fd266}, [r10+1728];
ld.shared.v2.f64 {fd269, fd270}, [r10+2304];
ld.shared.v2.f64 {fd273, fd274}, [r10+2880];
add.f64 fd277, fd261, fd269;
add.f64 fd278, fd253, fd277;
add.f64 fd279, fd262, fd270;
add.f64 fd280, fd254, fd279;
mul.f64 fd281, fd277, 0d3FE0000000000000;
sub.f64 fd282, fd253, fd281;
sub.f64 fd283, fd262, fd270;
mul.f64 fd284, fd283, 0d3FEBB67AE8584CAA;
add.f64 fd285, fd284, fd282;
sub.f64 fd286, fd282, fd284;
mul.f64 fd287, fd279, 0d3FE0000000000000;
sub.f64 fd288, fd254, fd287;
sub.f64 fd289, fd261, fd269;
mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA;
sub.f64 fd291, fd288, fd290;
add.f64 fd292, fd290, fd288;
add.f64 fd293, fd265, fd273;
add.f64 fd294, fd257, fd293;
add.f64 fd295, fd266, fd274;
add.f64 fd296, fd258, fd295;
mul.f64 fd297, fd293, 0d3FE0000000000000;
sub.f64 fd298, fd257, fd297;
sub.f64 fd299, fd266, fd274;
mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA;
add.f64 fd301, fd300, fd298;
sub.f64 fd302, fd298, fd300;
mul.f64 fd303, fd295, 0d3FE0000000000000;
sub.f64 fd304, fd258, fd303;
sub.f64 fd305, fd265, fd273;
mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA;
sub.f64 fd307, fd304, fd306;
add.f64 fd308, fd306, fd304;
mul.f64 fd309, fd301, 0d3FE0000000000000;
mul.f64 fd310, fd307, 0dBFEBB67AE8584CAA;
sub.f64 fd311, fd309, fd310;
mul.f64 fd312, fd307, 0d3FE0000000000000;
fma.rn.f64 fd313, fd301, 0dBFEBB67AE8584CAA, fd312;
mul.f64 fd314, fd302, 0dBFE0000000000000;
mul.f64 fd315, fd308, 0dBFEBB67AE8584CAA;
sub.f64 fd316, fd314, fd315;
mul.f64 fd317, fd308, 0dBFE0000000000000;
fma.rn.f64 fd318, fd302, 0dBFEBB67AE8584CAA, fd317;
add.f64 %1, fd280, fd296;
add.f64 %0, fd278, fd294;
add.f64 %3, fd291, fd313;
add.f64 %2, fd285, fd311;
add.f64 %5, fd292, fd318;
add.f64 %4, fd286, fd316;
sub.f64 %7, fd280, fd296;
sub.f64 %6, fd278, fd294;
sub.f64 %9, fd291, fd313;
sub.f64 %8, fd285, fd311;
sub.f64 %11, fd292, fd318;
sub.f64 %10, fd286, fd316;
})"
     : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)
     // Shared-memory stores/loads and barriers are hidden from the compiler.
     : "memory");
}


#endif
